package com.m3958.encode.detector.impl;

import java.nio.file.Path;

import com.m3958.encode.detector.AbstractDetector;
import com.m3958.encode.detector.Detector;
import com.m3958.encode.detector.LanguageName;

/*
 * The bytes 0xFF and 0xFE never appear in UTF-8 encoded text, so they can be used to
 * indicate UTF-16 or UTF-32 text (see BOM).
 * https://en.wikipedia.org/wiki/UTF-8
 * Code points with lower numerical values (i.e., earlier code positions in the Unicode character set, which tend to occur more frequently) are encoded using fewer bytes.
 * The first 128 characters of Unicode, which correspond one-to-one with ASCII, are encoded using a single octet with the same binary value as ASCII, making valid ASCII text valid UTF-8-encoded Unicode as well. And ASCII bytes do not occur when encoding non-ASCII code points into UTF-8.
 *
 * Bits of     First       Last        Bytes in
 * code point  code point  code point  sequence  Byte 1    Byte 2    Byte 3    Byte 4    Byte 5    Byte 6
 * 7           U+0000      U+007F      1         0xxxxxxx
 * 11          U+0080      U+07FF      2         110xxxxx  10xxxxxx
 * 16          U+0800      U+FFFF      3         1110xxxx  10xxxxxx  10xxxxxx
 * 21          U+10000     U+1FFFFF    4         11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
 * 26          U+200000    U+3FFFFFF   5         111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
 * 31          U+4000000   U+7FFFFFFF  6         1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
 *
 * Lead-byte prefixes:
 *   0        starts a 1-byte sequence
 *   110      starts a 2-byte sequence
 *   1110     starts a 3-byte sequence
 *   11110    starts a 4-byte sequence
 *   111110   starts a 5-byte sequence
 *   1111110  starts a 6-byte sequence
 */
/**
 * Detector that classifies candidate byte sequences according to the UTF-8 encoding rules
 * (including the historical 5- and 6-byte forms) and updates the counters inherited from
 * {@link AbstractDetector}.
 */
public class UTF8 extends AbstractDetector implements Detector {

    /** Mask for the top bit; a zero result means the byte is a single-byte (ASCII) value. */
    protected static final byte mask1 = (byte) 0x80;
    /** Mask for the top two bits; used to recognise continuation bytes ({@code 10xxxxxx}). */
    protected static final byte mask2 = (byte) 0xC0;
    /** Mask for the top three bits. Kept for subclasses; not used by this class itself. */
    protected static final byte mask3 = (byte) 0xE0;
    /** Mask for the top four bits. Kept for subclasses; not used by this class itself. */
    protected static final byte mask4 = (byte) 0xF0;
    /** Mask for the top five bits. Kept for subclasses; not used by this class itself. */
    protected static final byte mask5 = (byte) 0xF8;
    /** Mask for the top six bits. Kept for subclasses; not used by this class itself. */
    protected static final byte mask6 = (byte) 0xFC;

    /** Value of {@code b & mask2} for a continuation byte, i.e. the bit pattern 0b1000_0000. */
    protected static final byte bhex80 = (byte) 0x80;

    /** First byte of the UTF-8 byte order mark (EF BB BF). */
    protected static final byte bom1 = (byte) 0xEF;
    /** Second byte of the UTF-8 byte order mark. */
    protected static final byte bom2 = (byte) 0xBB;
    /** Third byte of the UTF-8 byte order mark. */
    protected static final byte bom3 = (byte) 0xBF;

    /** Longest byte sequence this detector accepts for a single character. */
    private static final int MAX_SEQUENCE_LENGTH = 6;

    public UTF8() {
    }

    public UTF8(byte[] bytes) {
        super(bytes);
    }

    public UTF8(Path path) {
        super(path);
    }

    /**
     * Classifies one candidate character and updates the detector counters.
     *
     * <p>Return codes produced by this method:
     * <ul>
     * <li>{@code -2}: more than six bytes were supplied, so this cannot be one UTF-8 sequence;</li>
     * <li>{@code 1}: the lead byte announces a sequence whose length differs from the number of
     * bytes supplied (presumably the caller retries with one more byte; confirm against
     * {@code AbstractDetector}, which is not visible from this file);</li>
     * <li>{@code 0}: the supplied bytes have been classified.</li>
     * </ul>
     *
     * <p>Counter updates: an ASCII byte increments {@code successedNumber} and
     * {@code asciiNumber}; a well-formed multi-byte sequence increments {@code successedNumber}
     * and {@code charNumber}; an invalid lead byte or a bad continuation byte increments
     * {@code failedNumber}. A complete byte order mark (EF BB BF) is accepted without touching
     * any counter.
     *
     * @param bytes the candidate sequence; must contain at least one byte
     * @return {@code -2}, {@code 0} or {@code 1} as described above
     */
    @Override
    protected int detectOne(byte... bytes) {
        int len = bytes.length;

        if (len > MAX_SEQUENCE_LENGTH) { // too long to be a single UTF-8 sequence
            return -2;
        }

        byte lead = bytes[0];

        if ((lead & mask1) == 0x0) { // 0xxxxxxx: plain ASCII
            successedNumber++;
            asciiNumber++;
            return 0;
        }

        int expected = sequenceLength(lead);
        if (expected == 0) {
            // A continuation byte (0x80..0xBF) or 0xFE/0xFF can never start a sequence.
            failedNumber++;
            return 0;
        }

        if (len != expected) { // sequence not complete yet
            return 1;
        }

        // Only a complete EF BB BF is the byte order mark. 0xEF on its own is an ordinary
        // 3-byte lead byte (U+F000..U+FFFF) and must go through the normal validation below.
        if (expected == 3 && lead == bom1 && bytes[1] == bom2 && bytes[2] == bom3) {
            return 0;
        }

        if (byteStartWith10(bytes)) {
            successedNumber++;
            charNumber++;
        } else {
            failedNumber++;
        }
        return 0;
    }

    /**
     * Returns the total sequence length announced by a lead byte.
     *
     * @param lead the first byte of a candidate sequence
     * @return 1 to 6 for a valid lead byte, or 0 when the byte cannot start a sequence
     *         (a continuation byte, or 0xFE/0xFF)
     */
    private static int sequenceLength(byte lead) {
        int value = lead & 0xFF; // compare as unsigned
        if (value < 0x80) {
            return 1; // 0xxxxxxx
        }
        if (value < 0xC0) {
            return 0; // 10xxxxxx: continuation byte
        }
        if (value < 0xE0) {
            return 2; // 110xxxxx
        }
        if (value < 0xF0) {
            return 3; // 1110xxxx
        }
        if (value < 0xF8) {
            return 4; // 11110xxx
        }
        if (value < 0xFC) {
            return 5; // 111110xx
        }
        if (value < 0xFE) {
            return 6; // 1111110x
        }
        return 0; // 0xFE and 0xFF
    }

    /**
     * Checks that every byte after the first is a continuation byte ({@code 10xxxxxx}).
     *
     * @param bytes the candidate sequence; index 0 (the lead byte) is not examined
     * @return {@code true} if all bytes from index 1 onwards start with the bits {@code 10}
     */
    private boolean byteStartWith10(byte[] bytes) {
        for (int i = 1; i < bytes.length; i++) {
            if ((bytes[i] & mask2) != bhex80) {
                return false;
            }
        }
        return true;
    }

    @Override
    protected String getCharsetName() {
        return "UTF-8";
    }

    /**
     * @return the maximum number of bytes a single character may occupy, which is 6 here
     */
    @Override
    public int maxCharBytes() {
        return MAX_SEQUENCE_LENGTH;
    }

    /* (non-Javadoc)
     * @see com.m3958.encode.detector.AbstractDetector#getLanguageName()
     */
    @Override
    protected LanguageName getLanguageName() {
        return LanguageName.UTF;
    }

}
